In [1]:
from setup import *  # NOTE(review): wildcard import; the traceback below shows
                     # setup.py still calls dict.iteritems() (Python 2 only,
                     # removed in Python 3) -- setup.py must switch to .items()
                     # before this cell can succeed on a Py3 kernel.
import sys
# Make the data directory importable so `constants` below can be found.
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *  # relies on DATA_PATH having been appended above


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-1-1f42eb16d431> in <module>()
----> 1 from setup import *
      2 import sys
      3 if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
      4 from constants import *

/home/hobs/src/AgileMachineLearning/docs/nlp/notebooks/setup.py in <module>()
    685 POS_LABELS = {'0': '1', 'False': 'True', 'F': 'T', 'No': 'Yes', 'N': 'P',
    686               'None': 'Positive', 'Neg': 'Pos', 'Negative': 'Positive', "A": "B"}
--> 687 POS_LABELS_INVERSE = dict((v, k) for k, v in POS_LABELS.iteritems())
    688 POS_LABELS_LOWER = dict((k.lower(), v.lower()) for k, v in POS_LABELS.iteritems())
    689 POS_LABELS_LOWER_INVERSE = dict((v.lower(), k.lower()) for k, v in POS_LABELS.iteritems())

AttributeError: 'dict' object has no attribute 'iteritems'

In [3]:
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 300)
pd.set_option('precision', 2)
%pprint


Pretty printing has been turned ON

In [4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [5]:
# Load the pre-trained LSI topic models from disk.
lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))
lsi4 = LsiModel.load(os.path.join(DATA_PATH, 'lsi4'))
# Only the LAST expression in a cell is displayed; the original bare
# `lsi.num_topics` line was silently discarded, so print it explicitly.
print(lsi.num_topics)
lsi.show_topics(1, 10)


Out[5]:
[(0,
  '0.657*"Python" + 0.378*"RT" + 0.243*"to" + 0.241*"a" + 0.222*"in" + 0.196*"python" + 0.178*"the" + 0.163*"and" + 0.146*"for" + 0.129*"I"')]

In [17]:
# favorites
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'))
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'))

vocab = Dictionary(df.txt.str.split())

tfidf = TfidfModel(id2word=vocab, dictionary=vocab)

lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))

# Bags of words
df = pd.DataFrame.from_csv(os.path.join(DATA_PATH, 'text.csv.gz'))
bows = pd.Series(vocab.doc2bow(toks) for toks in df.txt.str.split())

In [40]:
# Project each TF-IDF bag-of-words through the LSI model.
topics = lsi[tfidf[bows]]
# gensim yields only the nonzero topic weights per document, so any topic
# a document doesn't touch becomes NaN in the DataFrame.
topics = pd.DataFrame((dict(top) for top in topics), index=df.index)
print(topics.isnull().sum())
# Avoid inplace=True: rebinding keeps the cell idempotent and avoids
# hidden-state surprises on re-run.
topics = topics.fillna(0)
print(topics.isnull().sum())


0     23
1     23
2     22
      ..
97     5
98     5
99     3
dtype: int64
0     0
1     0
2     0
     ..
97    0
98    0
99    0
dtype: int64

In [41]:
# .round() already returns a NEW DataFrame, so the .copy() was redundant.
topics.round(2)


Out[41]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
0 0.12 0.32 -0.10 -0.19 0.01 -0.02 0.25 -0.09 0.13 0.01 0.08 0.04 -0.08 0.04 0.05 -0.04 0.00 0.02 0.03 0.15 0.11 -0.03 -0.07 0.04 0.01 -0.02 0.01 0.00 0.02 -0.04 -0.02 0.01 0.02 0.00 0.02 0.04 0.01 0.01 -0.04 -0.01 0.00 0.01 0.00 -0.02 0.02 -0.01 0.01 -0.00 0.02 0.02 0.03 -0.00 -0.02 -0.00 0.01 -0.01 -0.03 0.02 0.01 -0.00 0.02 0.01 0.01 -0.01 -0.01 0.00 0.01 -0.00 0.01 0.02 0.00 0.01 -0.00 -0.01 -0.01 0.00 0.00 0.00 0.00 0.02 0.00 0.01 0.00 0.00 0.00 -0.02 0.03 0.03 0.01 -0.01 0.00 -0.00 0.02 -0.01 0.03 -0.01 0.02 0.00 0.01 -0.01
1 0.41 -0.36 0.09 0.04 -0.02 -0.00 0.11 -0.08 0.03 -0.01 0.16 0.04 -0.11 0.01 0.04 -0.05 -0.05 -0.00 -0.06 0.02 0.05 -0.02 -0.03 0.02 -0.05 -0.01 0.02 -0.01 -0.02 -0.00 0.02 0.04 -0.01 -0.01 -0.02 0.03 0.01 0.01 -0.01 -0.00 0.00 0.03 0.03 -0.05 0.02 -0.01 0.02 -0.02 0.02 0.01 0.01 0.00 -0.03 0.02 -0.01 -0.05 -0.01 0.02 0.01 0.03 0.00 0.03 -0.06 -0.01 -0.03 -0.04 0.01 -0.02 -0.01 -0.01 -0.05 0.00 -0.05 -0.02 -0.00 -0.03 -0.06 -0.01 0.03 0.04 -0.01 0.02 -0.02 -0.05 0.10 0.00 0.02 -0.05 0.01 -0.02 0.03 -0.00 0.05 0.02 0.03 -0.02 -0.01 0.05 0.02 -0.01
2 0.03 -0.00 0.00 0.01 -0.02 -0.01 0.01 -0.01 0.01 0.00 0.03 0.00 -0.03 -0.02 0.01 -0.00 -0.01 0.02 -0.03 -0.02 -0.00 -0.01 0.01 0.07 -0.01 0.00 -0.03 -0.01 0.07 -0.07 -0.02 -0.05 0.04 0.08 -0.01 -0.09 -0.02 -0.00 -0.06 -0.01 0.03 0.01 0.01 -0.20 -0.09 0.10 0.05 -0.24 -0.32 -0.18 0.01 -0.04 -0.05 -0.05 -0.07 0.03 -0.02 0.01 0.09 -0.02 -0.11 0.02 0.15 -0.02 -0.02 -0.11 -0.06 0.03 -0.06 0.01 -0.01 -0.01 -0.08 0.00 -0.01 -0.00 -0.05 0.03 -0.02 -0.02 0.00 -0.01 0.00 -0.04 0.02 0.01 -0.04 -0.04 0.01 0.06 -0.03 -0.04 -0.00 0.01 -0.04 0.01 0.01 0.04 -0.02 0.03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
183067 0.03 0.01 -0.02 0.01 -0.03 -0.01 -0.01 -0.07 0.05 -0.04 0.21 -0.09 0.35 0.07 -0.11 0.06 0.03 -0.07 -0.00 0.02 -0.04 -0.04 -0.03 0.01 -0.00 -0.03 0.02 0.01 0.01 -0.00 0.00 0.01 0.00 0.01 0.01 0.00 -0.01 -0.01 -0.00 -0.01 0.01 -0.01 -0.01 -0.01 -0.00 0.01 0.01 0.00 -0.01 0.00 -0.00 -0.00 0.01 -0.01 0.02 -0.01 0.01 0.02 0.00 -0.01 -0.00 0.00 -0.00 -0.00 -0.01 -0.01 0.01 0.01 -0.01 -0.02 -0.00 -0.01 -0.00 0.01 -0.00 0.01 -0.00 0.01 0.01 0.03 0.01 0.01 0.01 0.02 0.00 0.01 0.01 -0.01 0.00 -0.01 -0.01 -0.01 0.01 -0.00 0.01 0.00 0.00 0.00 -0.01 0.01
183068 0.41 -0.36 0.09 0.04 -0.02 -0.00 0.11 -0.08 0.03 -0.01 0.16 0.04 -0.11 0.01 0.04 -0.05 -0.05 -0.00 -0.06 0.02 0.05 -0.02 -0.03 0.02 -0.05 -0.01 0.02 -0.01 -0.02 -0.00 0.02 0.04 -0.01 -0.01 -0.02 0.03 0.01 0.01 -0.01 -0.00 0.00 0.03 0.03 -0.05 0.02 -0.01 0.02 -0.02 0.02 0.01 0.01 0.00 -0.03 0.02 -0.01 -0.05 -0.01 0.02 0.01 0.03 0.00 0.03 -0.06 -0.01 -0.03 -0.04 0.01 -0.02 -0.01 -0.01 -0.05 0.00 -0.05 -0.02 -0.00 -0.03 -0.06 -0.01 0.03 0.04 -0.01 0.02 -0.02 -0.05 0.10 0.00 0.02 -0.05 0.01 -0.02 0.03 -0.00 0.05 0.02 0.03 -0.02 -0.01 0.05 0.02 -0.01
183069 0.00 -0.00 0.00 -0.00 -0.00 -0.00 0.00 -0.00 0.00 -0.00 -0.00 -0.00 0.00 -0.00 -0.00 0.00 -0.00 -0.00 0.00 0.00 -0.00 -0.00 -0.00 -0.00 -0.00 -0.00 0.00 0.00 0.00 -0.00 0.00 -0.00 -0.00 0.00 0.00 0.00 -0.00 0.00 -0.00 -0.00 0.00 0.00 0.00 0.00 0.00 -0.00 -0.00 -0.00 -0.01 0.00 -0.01 -0.00 0.00 -0.00 0.00 -0.00 0.00 0.00 -0.00 -0.01 -0.01 0.00 -0.01 0.00 -0.02 -0.00 0.01 0.01 0.01 -0.00 -0.00 -0.01 -0.00 0.01 0.01 0.01 0.01 0.00 -0.01 0.01 0.01 0.01 0.00 -0.01 0.01 -0.02 0.00 -0.00 -0.00 -0.01 0.00 0.01 -0.01 -0.01 0.00 0.01 0.00 0.00 -0.00 0.00

183070 rows × 100 columns

When I first ran this, my dataframes weren't "aligned".
So it's very important to check your datasets after every load.
The correspondence between dates and topics and numerical features is critical for training!


In [42]:
# Confirm all three frames describe the same number of tweets.
for frame in (dates, topics, nums):
    print(len(frame))


183070
183070
183070

In [43]:
# Index.equals is the idiomatic (and O(n) short-circuiting) way to verify
# alignment, instead of summing an elementwise comparison.
nums.index.equals(dates.index)


Out[43]:
True

In [44]:
# Compare directly against the topics index. (The original compared the
# match count to len(dates) rather than len(topics), silently relying on
# all three frames having equal length.)
nums.index.equals(topics.index)


Out[44]:
True

In [45]:
# Instantiate LDA with its defaults (svd solver); the bare `disc` on the
# last line displays the estimator's parameters.
disc = LinearDiscriminantAnalysis()
disc


Out[45]:
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [46]:
# Compress the heavy-tailed favorite_count into a handful of ordinal
# classes: x**0.13 grows very slowly, so ceil() maps 0 -> 0, 1 -> 1, and
# large counts to small integers that fit comfortably in int8.
category = (np.ceil(nums.favorite_count ** .13)).astype(np.int8)

In [47]:
# Fit LDA on the 100 LSI topic weights to predict the favorite-count class.
disc = LinearDiscriminantAnalysis().fit(topics, category)

In [50]:
# Predict a class for every tweet and peek at the first hundred.
predicted_favorites = disc.predict(topics)
predicted_favorites[:100]


Out[50]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [52]:
# Count tweets predicted to receive at least one favorite.
(predicted_favorites > 0).sum()


Out[52]:
2144

Wow!

DiscriminantAnalysis is VERY discriminating!


In [163]:
# Ground truth: tweets that actually received at least one favorite.
(nums.favorite_count >= 1).sum()


Out[163]:
33292

But not in a good way.
About 15x more true favorites than predicted (33,292 vs 2,144). Our unbalanced training set makes it easy for the judge to be tough.
Let's mellow our judge a bit...


In [164]:


In [165]:
# Put predictions and ground truth side by side, then cross-tabulate.
results = pd.DataFrame({
    'predicted': predicted_favorites,
    'truth': nums.favorite_count >= 1,
})
conf = Confusion(results)
conf


Out[165]:
truth False True
predicted
False 147725 31117
True 2053 2175

In [166]:
# Pearson correlation of boolean predicted vs truth -- this equals the
# Matthews correlation coefficient (matches 'mcc' in conf.stats_dict).
results.predicted.corr(results.truth)


Out[166]:
0.1326

In [167]:
# Summary metrics from the Confusion helper (tpr, fpr, accuracy, mcc, ...).
conf.stats_dict


Out[167]:
PrettyDict([('tpr', 0.8260),
            ('fpr', 0.4856),
            ('tnr', 0.5144),
            ('fnr', 0.1740),
            ('plr', 1.7011),
            ('nlr', 0.3382),
            ('accuracy', 0.8188),
            ('sensitivity', 0.8260),
            ('specificity', 0.5144),
            ('mcc', 0.1326),
            ('chi_squared', 3217.3986)])

High accuracy, but low MCC (correlation)

Balance the training?
Get rid of some negatives?
Accentuate the positive? <-- give this a try yourself


In [168]:
# Rebalance the training set by downsampling negatives to roughly the
# positive rate (~1:1 positives to sampled negatives).
pos = np.array(nums.favorite_count >= 1)
neg = ~pos
portion_pos = pos.mean()  # Py3: no need for the float(sum(...))/len(...) dance
# Keep every positive, plus a random ~portion_pos slice of the negatives.
# NOTE(review): np.random is unseeded here, so the mask (and everything
# fit on it) changes on every run -- seed for reproducibility.
mask = ((np.random.binomial(1, portion_pos, size=len(nums)).astype(bool) & neg) | pos)
disc = LinearDiscriminantAnalysis().fit(topics[mask], (nums.favorite_count[mask] >= 1))
print(mask.sum())
print(pos.sum() * 2)


60633
66584

In [169]:
# Score the rebalanced model against ALL tweets, not just the training mask.
results = pd.DataFrame({
    'predicted': disc.predict(topics.values),
    'truth': nums.favorite_count.values >= 1,
})
conf = Confusion(results)
conf


Out[169]:
truth False True
predicted
False 79751 1355
True 70027 31937

In [170]:
# MCC (Pearson correlation of the two boolean columns) for the 1:1 model.
results.predicted.corr(results.truth)


Out[170]:
0.3819

In [171]:
# Metrics for the rebalanced model (high recall, poor specificity).
conf.stats_dict


Out[171]:
PrettyDict([('tpr', 0.9833),
            ('fpr', 0.6868),
            ('tnr', 0.3132),
            ('fnr', 0.0167),
            ('plr', 1.4317),
            ('nlr', 0.0533),
            ('accuracy', 0.6101),
            ('sensitivity', 0.9833),
            ('specificity', 0.3132),
            ('mcc', 1.1766),
            ('chi_squared', 253427.2867)])

So let's add some more negative examples back in.
50x imbalance is definitely misleading.
But 2-5x imbalance is probably OK.


In [172]:
# Try a milder 3:1 negative:positive rebalancing.
portion_neg = 3 * portion_pos
mask = ((np.random.binomial(1, portion_neg, size=len(nums)).astype(bool) & neg) | pos)
# Index via .values for plain positional boolean masking, consistent with
# the later cells (topics rows and nums rows are position-aligned).
disc = LinearDiscriminantAnalysis().fit(topics.values[mask], (nums.favorite_count.values[mask] >= 1))
print(mask.sum())
print(pos.sum() * 2)


114711
66584

In [173]:
# Evaluate the 3:1-rebalanced model on the full dataset.
results = pd.DataFrame({
    'predicted': disc.predict(topics.values),
    'truth': nums.favorite_count.values > 0,
})
conf = Confusion(results)
conf


Out[173]:
truth False True
predicted
False 133025 20356
True 16753 12936

At least the confusion matrix looks balanced now


In [174]:
# MCC for the 3:1 rebalancing -- lower than the 1:1 run above.
results.predicted.corr(results.truth)


Out[174]:
0.2896

Should have known, imbalance doesn't help...


In [179]:
# Settle on 2:1 negatives to positives.
portion_neg = 2 * portion_pos
mask = ((np.random.binomial(1, portion_neg, size=len(nums)).astype(bool) & neg) | pos)
disc = LinearDiscriminantAnalysis().fit(topics.values[mask], (nums.favorite_count.values > 0)[mask])
print(mask.sum())
print(pos.sum() * 2)


87655
66584

In [180]:
# Evaluate the 2:1-rebalanced model on the full dataset.
results = pd.DataFrame({
    'predicted': disc.predict(topics.values),
    'truth': nums.favorite_count.values > 0,
})
conf = Confusion(results)
conf


Out[180]:
truth False True
predicted
False 101101 6434
True 48677 26858

In [181]:
# Final MCC for the 2:1 rebalancing.
results.predicted.corr(results.truth)


Out[181]:
0.3775

So it looks like 38% correlation is all we can squeeze out of this simple model
Next up... adding number of followers and other features